Data formats

Creating dataframes

Making a df from files in folder

library(tidyverse)

# dir_to_df(): turn relative file paths into a two-column data frame.
#
# files: paths shaped like "site/image", optionally prefixed with "./"
#        (a character vector; a list is flattened with unlist()).
# Returns a data frame with one row per path and the columns `site` and
# `image`, obtained by splitting each path on "/".
dir_to_df <- function(files) {
  # Strip only a literal leading "./". The former pattern "^./" left the dot
  # unescaped, so it also deleted any one-character first folder ("a/x.png").
  files <- sub("^\\./", "", files)
  df <- data.frame(column = unlist(files)) %>%
    separate(column, sep = "/", into = c("site", "image"))
  # Return the result visibly instead of relying on the value of an assignment
  df
}
# `files` is not created in this snippet; presumably it holds the output of
# something like list.files(recursive = TRUE) -- TODO confirm.
my_df <- dir_to_df(files)

Making a df from vectors

# Three parallel vectors, one per future column
employee <- c("John Doe", "Peter Gynn", "Jolie Hope") # character
salary <- c(21000, 23400, 26800) # numeric
startdate <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14")) # Date
# Combine them into a data frame, spelling out the column names
employ.data <- data.frame(
  employee = employee,
  salary = salary,
  startdate = startdate
)
str(employ.data)
## 'data.frame':    3 obs. of  3 variables:
##  $ employee : chr  "John Doe" "Peter Gynn" "Jolie Hope"
##  $ salary   : num  21000 23400 26800
##  $ startdate: Date, format: "2010-11-01" "2008-03-25" ...
# Same columns, but character columns (the employee names) become factors
employ.data2 <- data.frame(
  employee = employee,
  salary = salary,
  startdate = startdate,
  stringsAsFactors = TRUE
)

Dataframe manipulations

Accessing data

# List-style access: pick a column by name or by position
employ.data["employee"] # single brackets keep a data frame
##     employee
## 1   John Doe
## 2 Peter Gynn
## 3 Jolie Hope
employ.data$employee # `$` extracts the column as a vector
## [1] "John Doe"   "Peter Gynn" "Jolie Hope"
employ.data[["employee"]] # double brackets with a name: vector
## [1] "John Doe"   "Peter Gynn" "Jolie Hope"
employ.data[[1]] # double brackets with a position: vector
## [1] "John Doe"   "Peter Gynn" "Jolie Hope"
# Matrix-style access: [rows, columns]
head(employ.data, n = 2) # first two rows
##     employee salary  startdate
## 1   John Doe  21000 2010-11-01
## 2 Peter Gynn  23400 2008-03-25
employ.data[2:3, ] # rows 2 and 3, every column
##     employee salary  startdate
## 2 Peter Gynn  23400 2008-03-25
## 3 Jolie Hope  26800 2007-03-14
employ.data[employ.data$salary > 21000, ] # rows that satisfy a condition
##     employee salary  startdate
## 2 Peter Gynn  23400 2008-03-25
## 3 Jolie Hope  26800 2007-03-14
employ.data[1:2, 2] # a single column collapses to a vector
## [1] 21000 23400
employ.data[1:2, 2, drop = FALSE] # drop = FALSE keeps a data frame
##   salary
## 1  21000
## 2  23400

Modifying 1 value

# Overwrite one cell (row 1, column "salary"), then print the data frame
employ.data[1, "salary"] <- 30
employ.data
##     employee salary  startdate
## 1   John Doe     30 2010-11-01
## 2 Peter Gynn  23400 2008-03-25
## 3 Jolie Hope  26800 2007-03-14

Turn NAs into 0

# Replace every NA in `mydata` with 0
mydata[is.na(mydata)] <- 0

Adding columns/rows

# Add a row: the list supplies one value per column, in column order
employ.data2 <- rbind(employ.data, list("Lea Katz", 35000, "2021-10-01"))
# Add a column with cbind(); the result is stored in a new object
employ.data3 <- cbind(employ.data2, age = c(35, 45, 27, 26))
# Add a column with `$<-`; this modifies employ.data2 itself.
# Written as two statements on purpose: a chained assignment
# (a <- df$col <- value) stores only the numeric vector in `a`,
# not the updated data frame.
employ.data2$age <- c(35, 45, 27, 26)
employ_data3 <- employ.data2

Removing columns/rows

# Drop a column by assigning NULL to it
employ.data3[["age"]] <- NULL
# Drop the first row with a negative row index
employ_data5 <- employ.data2[-1, ]

Change colnames/rownames

# Base R: rename the third column
colnames(employ.data)[3] <- "start"
# dplyr: rename(new_name = old_name)
iris %>%
  rename(petal_length = Petal.Length)
# After a badly read csv. header.true() is not defined in this file;
# presumably a helper that promotes the first row to column names and drops
# the auto-generated ones -- TODO confirm.
area <- header.true(area)

Creating subsets

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the rows that satisfy a condition
employ.subset <- subset(employ.data, salary == 21000)
# Keep only some columns, in the order listed
select(employ.data, salary, employee)
##   salary   employee
## 1     30   John Doe
## 2  23400 Peter Gynn
## 3  26800 Jolie Hope

Find value and change it

# Fill a matrix cell by cell from a long table.
# `data` becomes a matrix whose first column supplies the row names of `m`.
# `sum` must provide the columns image_filename (row of `m`), label_names
# (column of `m`) and cover (value to store).
data <- as.matrix(data)
# drop = FALSE keeps `m` a matrix even when a single column is left,
# otherwise rownames<- below would fail on a plain vector
m <- data[, -1, drop = FALSE]
rownames(m) <- data[, 1]

# seq_len() yields an empty sequence when `sum` has no rows, whereas
# 1:nrow(sum) would iterate over c(1, 0)
for (x in seq_len(nrow(sum))) {
  i <- sum$image_filename[x]
  j <- sum$label_names[x]
  m[i, j] <- sum$cover[x]
}

Group by a variable and only keep last value

# Group the rows of `tlog` by `unix` (compared as text) and keep, for every
# column, the last value found in each group
tlog <- aggregate(
  tlog,
  by = list(as.character(tlog$unix)),
  FUN = last
)

Group by a variable and make a sum

# Sum one variable within groups
sum <- area %>%
  group_by(image_filename, label_names) %>%
  summarize(cover = sum(cover))
# Sum every non-grouping column within groups (the columns must be numeric).
# The summing function is wrapped in an anonymous function because:
#  - a bare `sum` passed to across() would resolve to the data frame named
#    `sum` created above, not to base::sum();
#  - passing extra arguments (na.rm) through across(...) is deprecated in
#    recent dplyr versions.
summed <- mydata %>%
  group_by(region.code) %>%
  summarise(across(everything(), function(x) sum(x, na.rm = TRUE)))

Use the values of the first row as names

# Take the values found in the first row of `images`, convert them to text and
# use them as the names of `images` (the first row itself is left in place).
names(images)<-as.character(unlist(images[1,]))

Create new columns with operations

library(dplyr)
# Add a column computed from an existing one
mutate(employ.data, double_salary = salary * 2)
##     employee salary  startdate double_salary
## 1   John Doe     30 2010-11-01            60
## 2 Peter Gynn  23400 2008-03-25         46800
## 3 Jolie Hope  26800 2007-03-14         53600

Merging 2 dataframes

Binding columns (rows match)

# Base R: put the columns of both data frames side by side
employ.info <- cbind(employ.data, personal.info)
# dplyr alternative
employ.info <- bind_cols(employ.data, personal.info)

Matching values for common variables

# Join on the shared `transect` column, keeping only rows matched in both
mydata <- mydata %>%
  inner_join(metadata, by = "transect")

Creating Lists

Making a list from df and vectors

# A list can hold objects of different kinds: here a data frame and a vector
age_vec <- c(35, 45, 27, 26)
mylist <- list(
  employees = employ.data,
  ages = age_vec
)

Vectors

Change “numerical vector” into “named numerical vector”

ages <- c(35, 45, 27, 26)
# One name per value: employ.data2 holds four employees once the extra row has
# been added, whereas employ.data holds three, which leaves the fourth age
# named NA.
# The helper is not called `names` so that it does not mask base::names().
employee_names <- as.character(employ.data2$employee)
names(ages) <- employee_names # attaches a name to each numeric value

Simple things

Paste

# Build a string from fixed and variable parts
dive <- "20220601-DIV1"
campaign <- "zeeland"
my_path <- paste0("data/", campaign, "/", dive, "/images")

# Inside a function call: the pasted string is used directly as the file name
file <- read.csv(file = paste0("tlog/csv-files/", dive, "-gps.csv"))

Plots

Spatial data

Position points

# Scatter plot of tree_cover against date
ggplot(data = graham_ind, mapping = aes(x = date, y = tree_cover)) +
  geom_point() +
  theme_light()

Tracked position

# geom_path() joins the points in dataset order; use geom_line() to join them
# in ascending order instead
ggplot(data = graham_ind, mapping = aes(x = longitude, y = latitude)) +
  geom_path()

Full positioning plot with all the labels and titles

# Layer order matters: the path is added first so the points sit on top of it
ggplot(data = graham_ind, mapping = aes(x = longitude, y = latitude)) +
  geom_path(color = "grey67") +
  geom_point(color = "pink") +
  # sides: r = right, b = bottom; alpha = transparency
  geom_rug(sides = "rb", alpha = .05) +
  # xlab("Longitude"), ylab("Latitude") and ggtitle("Trajectory of Caribou C12")
  # also work, but labs() is better
  labs(
    x = "Longitude", # use NULL if you don't want any label
    y = "Latitude",
    title = "Trajectory of Caribou C12",
    subtitle = "Map of 2,562 locations of female C12 tracked between 2001 and 2003 in Graham, Canada",
    caption = "Data: B.C. Ministry of Environment & Climate Change",
    tag = "Fig. 1"
  )

Coordinate system

# Linear and non-linear coordinate systems
# coord_cartesian(): min and max can be set for both axes (xlim, ylim)
ggplot(graham_ind, aes(x = longitude, y = latitude)) +
  geom_path() +
  coord_cartesian(ylim = c(56.8, NA))

# coord_cartesian() DOES NOT REMOVE POINTS OUTSIDE THE PLOT AREA, it only zooms.
# To really remove the points, set the limits on the scale instead:
ggplot(graham_ind, aes(x = longitude, y = latitude)) +
  geom_path() +
  scale_y_continuous(limits = c(56.8, NA))
## Warning: Removed 823 row(s) containing missing values (geom_path).

# Other coord_cartesian() features
ggplot(graham_ind, aes(x = longitude, y = latitude)) +
  geom_point() +
  coord_cartesian(
    ylim = c(56.8, NA),
    expand = FALSE, # to have a (0,0) origin
    clip = "off" # so points/labels don't get cut on the axis lines
  )

# coord_fixed(): `ratio` defaults to 1 when nothing is given
ggplot(graham_ind, aes(x = longitude, y = latitude)) +
  geom_point() +
  coord_fixed(ratio = 5) +
  scale_y_continuous(breaks = seq(56, 58, by = .04))

# coord_polar(): turns straight lines into curves
ggplot(graham_ind, aes(x = longitude, y = latitude)) +
  geom_path() +
  coord_polar()

Statistics

Boxplot

# One box of tree_cover per season
ggplot(data = graham_ind, mapping = aes(x = season, y = tree_cover)) +
  geom_boxplot() +
  theme_linedraw()

Assigning, saving, rendering, etc.

Assign plot to object

# Plain assignment stores the plot; it has to be printed in a separate step:
# g <- ggplot(graham_ind, aes(longitude, latitude)) +
#   geom_point()
# g
# Wrapping the assignment in parentheses stores the plot AND prints it directly
(g <- ggplot(graham_ind, aes(longitude, latitude)) +
  geom_point())
# Layers can be added to the plot object later on, which avoids retyping
# (and mistyping) the base plot when only some layers change.
# The argument is `sides` (plural): `side` is not a geom_rug() parameter, so
# the requested right/bottom placement was ignored.
g +
  geom_path(color = "purple") +
  geom_rug(sides = "rb", alpha = .08)

Save

# Raster graphic
ggsave(
  filename = "myggplot.png",
  width = 10,
  height = 7,
  dpi = 700 # resolution: use 600 or higher
)

# Vector graphic => best
ggsave(
  filename = "myggplot.pdf",
  width = 10,
  height = 7,
  device = cairo_pdf # doesn't work?
)

Different types of plots

Discrete variable

# When color/shape follow a variable of the dataset, they go inside aes()
ggplot(
  graham_ind,
  aes(x = longitude, y = latitude, color = season, shape = season)
) +
  geom_point()

Continuous variable

# Continuous variable mapped to color
ggplot(graham_ind, aes(x = longitude, y = latitude, color = tree_cover)) +
  geom_point()

# Continuous variable mapped to size. The fixed color and shape belong to
# geom_point() itself, not to the aesthetics.
ggplot(graham_ind, aes(x = longitude, y = latitude, size = tree_cover)) +
  geom_point(color = "darkgreen", shape = 1)

Scales

# Scales written out explicitly, without any customization
ggplot(graham_ind, aes(x = date, y = tree_cover, color = season)) +
  geom_point() +
  scale_x_date() + # tells ggplot that the x axis is a date
  scale_y_continuous() + # tells ggplot that the y axis is continuous
  scale_color_discrete()

# The same scales, customized
ggplot(graham_ind, aes(x = date, y = tree_cover, color = season)) +
  geom_point() +
  scale_x_date(
    name = NULL,
    expand = c(0, 0), # removes the "padding" on the sides
    date_breaks = "3 months", # date breaks can be written in English
    date_labels = "%m/%y" # shortens the dates to the desired abbreviation
  ) +
  scale_y_continuous(
    name = "Tree Cover",
    # the values are already like 75; for values like 0.75 use scale = 100
    labels = scales::percent_format(scale = 1),
    sec.axis = dup_axis(name = NULL) # duplicate the y axis for easier reading
  ) +
  scale_color_discrete(
    name = "Season:",
    type = c("#AE79F0", "#EEAF5C") # either normal color names or hex codes
  )

Color gradients

# Existing palette (more palettes are available through extra packages)
ggplot(graham_ind, aes(x = longitude, y = latitude, color = yday)) +
  geom_point() +
  scale_color_continuous(type = "viridis")

# Custom two-color gradient
ggplot(graham_ind, aes(x = longitude, y = latitude, color = yday)) +
  geom_point() +
  scale_color_gradient(low = "#AE79F0", high = "#EEAF5C")

heatmap

# NOTE(review): the "Computation failed" warning recorded below presumably
# means the hexbin package, which geom_hex() relies on, was not installed
# when this was run -- TODO confirm.
ggplot(graham_ind, aes(longitude, latitude)) +
  geom_hex(
    # after_stat(count) replaces the deprecated ..count.. notation; mapping
    # inside the geom is useful when there are several geoms
    aes(fill = after_stat(count)),
    bins = 15,
    color = "white", # white outlines draw the eye to global extremes
    size = 0.5
  ) +
  scale_fill_continuous(type = "viridis") +
  scale_color_continuous(type = "viridis", guide = "none") +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Heatmap of C12",
    caption = "Data source : blablabla"
  )
## Warning: Computation failed in `stat_binhex()`:

heatmap without the outlines

# Mapping the outline color to the bin count as well makes the outlines blend
# into the fill. after_stat(count) replaces the deprecated ..count.. notation.
ggplot(graham_ind, aes(longitude, latitude)) +
  geom_hex(aes(color = after_stat(count)), bins = 15) +
  scale_fill_continuous(
    type = "viridis",
    name = "Count:",
    # equivalent spellings: c(50, 100, 150, 200, 250) or seq(50, 250, by = 50)
    breaks = 1:5 * 50 # from 1 to 5, each multiplied by 50
  ) +
  scale_color_continuous(type = "viridis", guide = "none") +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Heatmap of C12",
    caption = "Data source : blablabla"
  )
## Warning: Computation failed in `stat_binhex()`:

# Another way to do it, with a palette from rcartocolor
ggplot(graham_ind, aes(longitude, latitude)) +
  # bins = resolution => always try several values
  geom_hex(bins = 15, color = "white", size = 1) +
  rcartocolor::scale_fill_carto_c(
    palette = "SunsetDark",
    name = "Observations:",
    breaks = 1:5 * 50
  )
## Warning: Computation failed in `stat_binhex()`:

Multiple plots together (facets)

# Base plot, stored so that different facetings can be added to it
g <- ggplot(graham_ind, aes(x = longitude, y = latitude)) +
  geom_path(color = "grey67") +
  geom_point(aes(color = factor(year))) + # aes() here colors only the points
  scale_color_discrete(
    name = "Year",
    # fixed alternative: labels = c("year 2001", "year 2002", "year 2003")
    labels = function(x) paste("Year", x)
  )
# One panel per season (the tilde ~ is typed with option + N)
g + facet_wrap(~season)

# One panel per year; to repeat the y labels on every mini-plot, see the
# lemon package
g + facet_wrap(~ factor(year))

to arrange the plots differently

# Choose the number of columns used to lay out the panels
g +
  facet_wrap(~ factor(year), ncol = 2)

# Two variables => grid (rows ~ columns)
g +
  facet_grid(factor(year) ~ season)

Aesthetics

Color palettes

# display.brewer.all() comes from RColorBrewer, which is not attached anywhere
# in this file; the namespace prefix makes the calls work without library().
RColorBrewer::display.brewer.all(type = "seq") # "Sequential": from light to dark
RColorBrewer::display.brewer.all(type = "div") # "Diverging": dark-light-dark
RColorBrewer::display.brewer.all(type = "qual") # "Qualitative": visual differences between groups
RColorBrewer::display.brewer.all(colorblindFriendly = TRUE) # colorblind-friendly palettes only

Using a palette in ggplot

# The color mapping is declared in ggplot(); geom_path() overrides it with a
# fixed grey, so only the points end up colored by year
ggplot(graham_ind, aes(x = longitude, y = latitude, color = factor(year))) +
  geom_path(color = "grey67") +
  geom_point() +
  scale_color_brewer(
    name = "Year",
    palette = "YlOrRd",
    # fixed alternative: labels = c("year 2001", "year 2002", "year 2003")
    labels = function(x) paste("Year", x)
  )

Maps

Mapview

# st_as_sf() comes from sf and mapview() from mapview; neither package is
# attached in this file, so both calls are namespace-qualified.
# Build a point layer from the coordinate columns (crs = 4326); remove = FALSE
# keeps the longitude/latitude columns in the result.
caribou <- sf::st_as_sf(
  graham_ind,
  coords = c("longitude", "latitude"),
  crs = 4326,
  remove = FALSE
)
# Interactive map of the points
mapview::mapview(caribou)